1 The data

In smoking_status Unknown should be changed to NA.

Also, it can be ordered: never < formerly < smokes

ever_married can be recoded as 0/1 in accordance with heart_disease and hypertension

Other predictors seem to be OK

df <- read_csv("data/healthcare-dataset-stroke-data.csv", col_types = "cfdfffffddcf", na = c("Unknown", "N/A"))

# if you set smoking_status to factor in col_types, na() won't work
df$smoking_status <- as_factor(df$smoking_status)
df$smoking_status <- fct_relevel(df$smoking_status, c("never smoked", "formerly smoked", "smokes"))

# married
df$ever_married <- factor(if_else(df$ever_married == "Yes", 1, 0))

# for models working properly
df$stroke <- factor(ifelse(df$stroke == 1, "yes", "no"), levels = c("no", "yes"))

df

1.1 Descriptive statistics

Skip id column

df$id <- NULL
skimr::skim(df)
Data summary
Name df
Number of rows 5110
Number of columns 11
_______________________
Column type frequency:
factor 8
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1.0 FALSE 3 Fem: 2994, Mal: 2115, Oth: 1
hypertension 0 1.0 FALSE 2 0: 4612, 1: 498
heart_disease 0 1.0 FALSE 2 0: 4834, 1: 276
ever_married 0 1.0 FALSE 2 1: 3353, 0: 1757
work_type 0 1.0 FALSE 5 Pri: 2925, Sel: 819, chi: 687, Gov: 657
Residence_type 0 1.0 FALSE 2 Urb: 2596, Rur: 2514
smoking_status 1544 0.7 FALSE 3 nev: 1892, for: 885, smo: 789
stroke 0 1.0 FALSE 2 no: 4861, yes: 249

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1.00 43.23 22.61 0.08 25.00 45.00 61.00 82.00 ▅▆▇▇▆
avg_glucose_level 0 1.00 106.15 45.28 55.12 77.24 91.88 114.09 271.74 ▇▃▁▁▁
bmi 201 0.96 28.89 7.85 10.30 23.50 28.10 33.10 97.60 ▇▇▁▁▁

Target ‘stroke’ is imbalanced!

‘smoking_status’ completeness rate 0.7

1.2 How many smoking_status in each target class?

df %>% group_by(stroke, smoking_status) %>% summarise(N=n())

BMI’s complete rate 0.96

1.3 How many skipped BMI in each target class?

df %>% filter(is.na(bmi)) %>% group_by(stroke) %>% summarise(N=n())

One ‘Other’ gender to be removed

df <- df %>% filter(gender != "Other")

1.4 EDA

1.4.1 Overview: a pairs plot

GGally::ggpairs(df, aes(color = stroke, alpha = 0.2, dotsize = 0.02), 
        upper = list(continuous = GGally::wrap("cor", size = 2.5)),
        diag = list(continuous = "barDiag")) +
  scale_color_brewer(palette = "Set1", direction = -1) +
  scale_fill_brewer(palette = "Set1", direction = -1)

1.4.2 In details

1.4.2.1 Stroke vs Age

ggplot(df, aes(stroke, age)) +
  geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
  geom_violin(aes(fill = stroke), alpha = 0.5) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  xlab("")

OBS! There are observation with age much below 20 y.o., even close to 0!

These are very young kids or babies - should we even include them in the analysis?

If so, it will be prediction for adults. Also, stroke in kids probably has very different causes compared to stroke in adults/old folk.

1.4.2.2 Stroke vs Age + Gender

ggplot(df, aes(stroke, age)) + 
  geom_violin(alpha=0.3) +
  geom_jitter(alpha=0.2, size=0.8, width = 0.15, height = 0.1, aes(color = gender)) + 
  geom_boxplot(alpha = 0.2) +
  scale_color_brewer(palette = "Set2", direction = -1)

1.4.2.3 Stroke vs Glucose

ggplot(df, aes(stroke, avg_glucose_level)) +
  geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
  geom_violin(aes(fill = stroke), alpha = 0.5) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  xlab("")

This average glucose level is probably the results of fasting blood sugar test

If I correctly understand this CDC information on diabetes, values greater than 126 is evidence of diabetes. But 250? Is it realistic?

1.4.2.4 Stroke vs BMI

ggplot(df, aes(stroke, bmi)) +
  geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
  geom_violin(aes(fill = stroke), alpha = 0.5) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  xlab("")

BMI over 40 is the 3rd class of obesity - BMI over 75 should not exist at all.

Let’s look at this weird points

1.4.2.5 Age vs BMI vs Glucose

ggplot(df, aes(age, bmi)) +
  geom_point(aes(color = stroke), alpha = 0.8, size = 0.5) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  facet_grid(rows = vars(stroke)) +
  guides(color = "none")

Patients with BMI over 75 are also very young. Suspicious.

1.4.2.6 Glucose vs Age + smoking

ggplot(df, aes(age, avg_glucose_level)) +
  geom_point(aes(color = smoking_status), alpha = 0.6, size = 1) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  facet_grid(rows = vars(stroke)) +
  guides()

OBS! Kids are mainly ‘Unknown’ smoking status; both target groups are divided into two clusters – I’m curious why.

It is not gender, nor heart disease or any other factor we have in the data set.

1.4.2.7 Age vs Smoking

ggplot(df, aes(smoking_status, age)) +
  geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  xlab("")

Kids are mainly non-smokers of course. Note the same two outliers of age below 20 in stroke-yes

1.4.2.8 Glucose vs BMI

ggplot(df, aes(avg_glucose_level, bmi)) +
  geom_point(aes(color = age), alpha = 0.6, size = 1) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  facet_grid(rows = vars(stroke)) +
  guides()

BMI outliers: super high BMI but super low glucose levels? How’s that possible?

Can it be a bias introduced by testing protocol misuse? Like instead of measuring glucose before eating, in some samples it was done after eating?

1.4.2.9 Stroke vs Gender

gender <- df %>% group_by(stroke, gender) %>% summarize(N=n())

ggplot(gender, aes(stroke, N)) +
  geom_bar(aes(fill=gender), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ylab("proportion")

Proportions in both stroke groups are roughly the same

1.4.2.10 Stroke vs Hypertension

hyptens <- df %>% group_by(stroke, hypertension) %>% summarize(N = n())

ggplot(hyptens, aes(stroke, N)) +
  geom_bar(aes(fill = hypertension), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ylab("proportion")

Hypertension occurred more often in stroke-yes

1.4.2.11 Stroke vs Heart Disease

heart <- df %>% group_by(stroke, heart_disease) %>% summarize(N=n())

ggplot(heart, aes(stroke, N)) +
  geom_bar(aes(fill = heart_disease), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = 1) +
  ylab("proportion")

1.4.2.12 Stroke vs Ever Married

married <- df %>% group_by(stroke, ever_married) %>% summarize(N=n())

ggplot(married, aes(stroke, N)) +
  geom_bar(aes(fill = ever_married), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ylab("proportion")

Marriage is bad :)

1.4.2.13 Stroke vs Work Type

work <- df %>% group_by(stroke, work_type) %>% summarize(N=n())

ggplot(work, aes(stroke, N)) +
  geom_bar(aes(fill = work_type), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = 1) +
  ylab("proportion")

It’s good to be a child

1.4.2.14 Stroke vs Residence Type

residence <- df %>% group_by(stroke, Residence_type) %>% summarize(N=n())

ggplot(residence, aes(stroke, N)) +
  geom_bar(aes(fill = Residence_type), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = 1) +
  ylab("proportion")

1.4.2.15 Stroke vs Smoking

smoking <- df %>% group_by(stroke, smoking_status) %>% summarize(N=n())

ggplot(smoking, aes(stroke, N)) +
  geom_bar(aes(fill = smoking_status), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = 1) +
  ylab("proportion")

1.4.3 Conclusions

There are several suspicious outliers (like in BMI and glucose). I can safely remove BMI > 75, maybe even BMI > 60 (Remember that BMI > 40 is the highest class of obesity).

What is less safe - it’s removing non-adults (younger than 20 y.o.). On one hand they provide valid information (age is very important predictor), on the other hand their stroke cases are really sus and a lot of predictors do not have sense (or are obvious NAs) for non-adults (like smoking, marriage status, employment type, residence type etc.); model-based imputation of smoking_status in children doesn’t have sense as well, I should rather replace with “never smoked”.

Since, modelling using all predictors and observations has given me very moderate results (TPR = 1 comes with very high FPR and very low probability cutoff close to 0), I will try various trimming of the data.

1.5 Trimming

Try ‘no kinds’ version

df_trim <- df %>% filter(bmi < 60, age > 20 )

skimr::skim(df_trim)
Data summary
Name df_trim
Number of rows 3896
Number of columns 11
_______________________
Column type frequency:
factor 8
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1.00 FALSE 2 Fem: 2383, Mal: 1513, Oth: 0
hypertension 0 1.00 FALSE 2 0: 3451, 1: 445
heart_disease 0 1.00 FALSE 2 0: 3654, 1: 242
ever_married 0 1.00 FALSE 2 1: 3186, 0: 710
work_type 0 1.00 FALSE 4 Pri: 2521, Sel: 757, Gov: 617, Nev: 1
Residence_type 0 1.00 FALSE 2 Urb: 1988, Rur: 1908
smoking_status 752 0.81 FALSE 3 nev: 1630, for: 807, smo: 707
stroke 0 1.00 FALSE 2 no: 3688, yes: 208

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 51.23 16.97 21.00 38.00 51.00 64.00 82.00 ▆▇▇▆▆
avg_glucose_level 0 1 108.16 47.60 55.12 77.22 92.21 115.98 271.74 ▇▃▁▁▁
bmi 0 1 30.49 6.93 11.30 25.60 29.40 34.30 59.70 ▁▇▅▁▁

BMI is complete, in total approx. 2000 observations are gone

1.6 Imputation

Using package mice

It uses polr - proportional odds model - for smoking_status and pmm - predictive mean matching - for bmi

1.6.1 Run imputation

library(mice)

imp_mice <- mice(df_trim)
## 
##  iter imp variable
##   1   1  smoking_status
##   1   2  smoking_status
##   1   3  smoking_status
##   1   4  smoking_status
##   1   5  smoking_status
##   2   1  smoking_status
##   2   2  smoking_status
##   2   3  smoking_status
##   2   4  smoking_status
##   2   5  smoking_status
##   3   1  smoking_status
##   3   2  smoking_status
##   3   3  smoking_status
##   3   4  smoking_status
##   3   5  smoking_status
##   4   1  smoking_status
##   4   2  smoking_status
##   4   3  smoking_status
##   4   4  smoking_status
##   4   5  smoking_status
##   5   1  smoking_status
##   5   2  smoking_status
##   5   3  smoking_status
##   5   4  smoking_status
##   5   5  smoking_status
df_imp <- complete(imp_mice)

Number of NAs in BMI: 0

Number of NAs in Smoking: 0

1.6.2 Check distributions

1.6.2.1 BMI

bmi_imp_comp <- bind_rows(select(df_trim, bmi, stroke) %>% mutate(type = rep("original", nrow(df_trim))),
          select(df_imp, bmi, stroke) %>% mutate(type = rep("imputed", nrow(df_imp))))

ggplot(bmi_imp_comp, aes(bmi)) +
  geom_histogram(aes(fill = type), alpha = 0.8) +
  facet_grid(cols = vars(stroke))

Means have not changed, which is good, I suppose.

1.6.2.2 Smoking

smoke_imp_comp <- bind_rows(select(df_trim, smoking_status, stroke) %>% mutate(type = rep("original", nrow(df_trim))),
          select(df_imp, smoking_status, stroke) %>% mutate(type = rep("imputed", nrow(df_imp))))

ggplot(smoke_imp_comp, aes(smoking_status)) +
  geom_bar(aes(fill=type), alpha=0.8, position="dodge") +
  facet_grid(cols = vars(stroke)) +
  xlab("")+
  theme(axis.text.x = element_text(angle=45, vjust = 0.5))

Counts increased proportionally in all Smoking groups

1.7 Scaling & Normalization

Scale numeric features (including imputed BMI)

# use caret::preProcess()
# preProcValues <- preProcess(training, method = c("center", "scale"))

df_scaled <- df_imp %>% 
  select(avg_glucose_level, age, bmi) %>% 
  scale() %>% 
  data.frame()

1.8 Make Dummies

I’ve decided to omit smoking_status completely - it won’t be dummified

# select vars
to_dum <- df_imp %>% select(gender, work_type, Residence_type, smoking_status)
# make an obj
dummies <- dummyVars(~ ., data = to_dum)
# apply it
df_dummy <- data.frame(predict(dummies, newdata = to_dum))

head(df_dummy)

1.9 Join scaled and dummies and the rest

df_proc <- bind_cols(df_scaled, df_dummy, select(df_trim, hypertension, heart_disease, ever_married, stroke))
head(df_proc)

2 Modelling

2.1 Params tuning

ROC-optimization is better when data is imbalanced

Kappa-optimization is also good

# for ROC
fit_ctrl_roc <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           repeats = 10, 
                           allowParallel = T,
                           classProbs = T,
                           summaryFunction = twoClassSummary)
# for kappa
fit_ctrl_kp <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           repeats = 10, 
                           allowParallel = T)

fit_ctrl_kp10 <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           repeats = 10, 
                           allowParallel = T)

2.2 Split data

Imbalanced data - use SMOTE to create training data set, but not testing data set

set.seed(1234)
sample_set <- createDataPartition(y = df_proc$stroke, p = .75, list = FALSE)
df_train <- df_proc[sample_set,]
df_test <- df_proc[-sample_set,]

# DMwR::SMOTE for imbalanced data: over=225 and under=150 give me 1:1 ratio
df_train_smote <- SMOTE(stroke ~ ., data.frame(df_train), perc.over = 1725, perc.under = 106)

df_train_smote %>% group_by(stroke) %>% summarise(N=n())

3 Random Forest

3.1 Training and validation

# start a cluster
library(doParallel)

cl <- makePSOCKcluster(THREADS)
registerDoParallel(cl)

3.1.1 Kappa-optimized

For imbalanced classes

set.seed(123)

fit_rf <- train(stroke ~ ., 
                 data = df_train_smote, 
                 metric = "Kappa", 
                 method = "rf", 
                 trControl = fit_ctrl_kp,
                 tuneGrid = expand.grid(.mtry = seq(2, 6, 0.5)), # I've tried all values greater than these
                 verbosity = 0,
                 verbose = FALSE)

fit_rf
## Random Forest 
## 
## 5619 samples
##   19 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 4495, 4495, 4495, 4495, 4496, 4494, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2.0   0.9608825  0.9217635
##   2.5   0.9605622  0.9211227
##   3.0   0.9656877  0.9313737
##   3.5   0.9675566  0.9351114
##   4.0   0.9674139  0.9348261
##   4.5   0.9671473  0.9342928
##   5.0   0.9681794  0.9363572
##   5.5   0.9683219  0.9366422
##   6.0   0.9682685  0.9365354
## 
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.5.

3.1.2 ROC-optimized

#cl <- makePSOCKcluster(THREADS)
#registerDoParallel(cl)

set.seed(120)

fit_rf_roc <- train(stroke ~ ., 
                 data = df_train_smote, 
                 metric = "ROC", 
                 method = "rf", 
                 trControl = fit_ctrl_roc,
                 tuneGrid = expand.grid(.mtry = seq(2, 6, 0.5)),
                 verbosity = 0,
                 verbose = FALSE)
#stopCluster(cl)

fit_rf_roc
## Random Forest 
## 
## 5619 samples
##   19 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 4495, 4496, 4495, 4496, 4494, 4496, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##   2.0   0.9871208  0.9795448  0.9412761
##   2.5   0.9869408  0.9798643  0.9411336
##   3.0   0.9894277  0.9890067  0.9412411
##   3.5   0.9905384  0.9911413  0.9431288
##   4.0   0.9904763  0.9910703  0.9425950
##   4.5   0.9905240  0.9912481  0.9427729
##   5.0   0.9910410  0.9913194  0.9441621
##   5.5   0.9912119  0.9903944  0.9454441
##   6.0   0.9911809  0.9905368  0.9449454
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.5.

3.2 Features importance

3.2.1 Kappa-optimized model

imp_vars_rf <- varImp(fit_rf)

plot(imp_vars_rf, main = "Variable Importance with RF")

3.2.2 ROC-optimized model

it’s the same

3.3 Testing

3.3.1 ROC & AUC

a function for roc-stuff

get_roc <- function(fit.obj, testing.df){
  pred_prob <- predict.train(fit.obj, newdata = testing.df, type = "prob")
  pred_roc <- prediction(predictions = pred_prob$yes, labels = testing.df$stroke)
  perf_roc <- performance(pred_roc, measure = "tpr", x.measure = "fpr")
  return(list(perf_roc, pred_roc))
}

3.3.1.1 ROC-curve for kappa-optimized model

# calculate ROC
perf_pred <- get_roc(fit_rf, df_test)
perf_rf <- perf_pred[[1]]
pred_rf <- perf_pred[[2]]

# take AUC 
auc_rf <- round(unlist(slot(performance(pred_rf, measure = "auc"), "y.values")), 3)

# plot
plot(perf_rf, main = "RF-k ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_rf))

3.3.1.2 ROC-curve for ROC-optimized model

# calculate ROC
perf_pred_roc <- get_roc(fit_rf_roc, df_test)
perf_rf_roc <- perf_pred_roc[[1]]
pred_rf_roc <- perf_pred_roc[[2]]

# take AUC 
auc_rf_roc <- round(unlist(slot(performance(pred_rf_roc, measure = "auc"), "y.values")), 3)

# plot
plot(perf_rf_roc, main = "RF-r ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_rf_roc))

So, we can adjust TPR/FPR cutoff to predict all strokes

3.3.2 TPR, FPR vs Probability cutoff

At which probability cut-off, you’ll get TPR = 1.0?

# use pred_rf (pred_roc) object
plot(performance(pred_rf, measure = "tpr", x.measure = "cutoff"),
     col="steelblue", 
     ylab = "Rate", 
     xlab="Probability cutoff")

plot(performance(pred_rf, measure = "fpr", x.measure = "cutoff"), 
     add = T, col = "red")

legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"), 
       lty = 1, col =c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)

#abline(v = 0.02, lwd = 2, lty=6)

title("RF-k")

Vertical line at cutoff = 0.02 designates maximum TPR and maximum FPR. Ideal cutoff should be to the left of this line

# use pred_rf (pred_roc) object
plot(performance(pred_rf_roc, measure = "tpr", x.measure = "cutoff"),
     col = "steelblue", 
     ylab = "Rate", 
     xlab = "Probability cutoff")

plot(performance(pred_rf_roc, measure = "fpr", x.measure = "cutoff"), 
     add = T, col = "red")

legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"), 
       lty = 1, col = c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)

#abline(v = 0.02, lwd = 2, lty=6)

title("RF-r")

Vertical line at 0.02

3.3.3 Confusion matrix

3.3.3.1 Kappa-optimized

Using desired cut-off: we want to maximize TPR (Sensitivity, Recall)!

According to the TPR/FPR plot (above) the optimal cutoff is

# predict probabilities
pred_prob_rf <- predict(fit_rf, newdata=df_test, type = "prob")

# choose your cut-off
cutoff = 0.01

# turn probabilities into classes
pred_class_rf <- ifelse(pred_prob_rf$yes > cutoff, "yes", "no")

pred_class_rf <- as.factor(pred_class_rf)

cm_rf <- confusionMatrix(data = pred_class_rf, 
                reference = df_test$stroke,
                mode = "everything",
                positive = "yes")

cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  295   2
##        yes 627  50
##                                           
##                Accuracy : 0.3542          
##                  95% CI : (0.3241, 0.3852)
##     No Information Rate : 0.9466          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0422          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.96154         
##             Specificity : 0.31996         
##          Pos Pred Value : 0.07386         
##          Neg Pred Value : 0.99327         
##               Precision : 0.07386         
##                  Recall : 0.96154         
##                      F1 : 0.13717         
##              Prevalence : 0.05339         
##          Detection Rate : 0.05133         
##    Detection Prevalence : 0.69507         
##       Balanced Accuracy : 0.64075         
##                                           
##        'Positive' Class : yes             
## 

3.3.3.2 ROC-optimized

# predict probabilities
pred_prob_rf_roc <- predict(fit_rf_roc, newdata = df_test, type = "prob")

# choose your cut-off
cutoff = 0.01

# turn probabilities into classes
pred_class_rf_roc <- ifelse(pred_prob_rf_roc$yes > cutoff, "yes", "no")

pred_class_rf_roc <- as.factor(pred_class_rf_roc)

cm_rf <- confusionMatrix(data = pred_class_rf_roc, 
                reference = df_test$stroke,
                mode = "everything",
                positive = "yes")

cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  295   2
##        yes 627  50
##                                           
##                Accuracy : 0.3542          
##                  95% CI : (0.3241, 0.3852)
##     No Information Rate : 0.9466          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0422          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.96154         
##             Specificity : 0.31996         
##          Pos Pred Value : 0.07386         
##          Neg Pred Value : 0.99327         
##               Precision : 0.07386         
##                  Recall : 0.96154         
##                      F1 : 0.13717         
##              Prevalence : 0.05339         
##          Detection Rate : 0.05133         
##    Detection Prevalence : 0.69507         
##       Balanced Accuracy : 0.64075         
##                                           
##        'Positive' Class : yes             
## 

4 AdaBoost

4.1 Training and validation

4.1.1 Kappa-optimized

10-fold CV

set.seed(122)

#cl <- makePSOCKcluster(THREADS)
#registerDoParallel(cl)

fit_adb <- train(stroke ~ ., 
                 data = df_train_smote, 
                 metric = "Kappa", 
                 method = "AdaBoost.M1", 
                 trControl = fit_ctrl_kp10,
                 tuneGrid = expand.grid(
                   .maxdepth = seq(10, 20, 2), 
                   .mfinal = seq(150, 180, 10), 
                   .coeflearn = c("Freund")),
                 verbosity = 0,
                 verbose = FALSE)
# coeflearn=Freund was chosen by automatic grid search, mfinal choice comes from there too

# stop CLuster
stopCluster(cl)

fit_adb
## AdaBoost.M1 
## 
## 5619 samples
##   19 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 5057, 5057, 5056, 5057, 5057, 5057, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  mfinal  Accuracy   Kappa    
##   10        150     0.9690510  0.9381009
##   10        160     0.9689264  0.9378516
##   10        170     0.9690332  0.9380653
##   10        180     0.9688020  0.9376028
##   12        150     0.9688196  0.9376381
##   12        160     0.9688729  0.9377447
##   12        170     0.9687481  0.9374950
##   12        180     0.9687839  0.9375666
##   14        150     0.9689262  0.9378512
##   14        160     0.9691400  0.9382789
##   14        170     0.9690687  0.9381362
##   14        180     0.9690867  0.9381722
##   16        150     0.9689977  0.9379943
##   16        160     0.9689440  0.9378869
##   16        170     0.9688195  0.9376379
##   16        180     0.9689975  0.9379938
##   18        150     0.9692650  0.9385287
##   18        160     0.9692470  0.9384928
##   18        170     0.9693538  0.9387064
##   18        180     0.9693004  0.9385996
##   20        150     0.9695314  0.9390616
##   20        160     0.9694779  0.9389546
##   20        170     0.9693534  0.9387055
##   20        180     0.9692821  0.9385631
## 
## Tuning parameter 'coeflearn' was held constant at a value of Freund
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were mfinal = 150, maxdepth = 20
##  and coeflearn = Freund.

4.2 Testing

4.2.1 ROC curve

# calculate ROC
perf_pred_adb <- get_roc(fit_adb, df_test)
perf_adb <- perf_pred_adb[[1]]
pred_adb <- perf_pred_adb[[2]]

# take AUC 
auc_adb <- round(unlist(slot(performance(pred_adb, measure = "auc"), "y.values")), 3)

# plot
plot(perf_adb, main = "AdaBoost ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_adb))

4.2.2 TPR, FPR vs Probability cutoff

At which probability cut-off, you’ll get TPR = 1.0?

# use pred_rf (pred_roc) object
plot(performance(pred_adb, measure = "tpr", x.measure = "cutoff"),
     col="steelblue", 
     ylab = "Rate", 
     xlab="Probability cutoff")

plot(performance(pred_adb, measure = "fpr", x.measure = "cutoff"), 
     add = T, col = "red")

legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"), 
       lty = 1, col =c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)

#abline(v = 0.1, lwd = 2, lty=6)

title("AdaBoost.M1")

4.2.3 Confusion matrix

pred_prob_adb <- predict(fit_adb, newdata = df_test, type = "prob")

# choose your cut-off
cutoff = 0.12

# turn probabilities into classes
pred_class_adb <- ifelse(pred_prob_adb$yes > cutoff, "yes", "no")

pred_class_adb <- as.factor(pred_class_adb)

cm_adb <- confusionMatrix(data = pred_class_adb, 
                reference = df_test$stroke,
                mode = "everything",
                positive = "yes")

cm_adb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  231   0
##        yes 691  52
##                                           
##                Accuracy : 0.2906          
##                  95% CI : (0.2622, 0.3202)
##     No Information Rate : 0.9466          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0345          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.25054         
##          Pos Pred Value : 0.06999         
##          Neg Pred Value : 1.00000         
##               Precision : 0.06999         
##                  Recall : 1.00000         
##                      F1 : 0.13082         
##              Prevalence : 0.05339         
##          Detection Rate : 0.05339         
##    Detection Prevalence : 0.76283         
##       Balanced Accuracy : 0.62527         
##                                           
##        'Positive' Class : yes             
## 

5 Extreme Gradient Boosting: xgbTree

xgbTree has 7 parameters

5.1 Training and validation

5.1.1 Kappa-optimized

10-fold CV

set.seed(121)

fit_xgb_kp <- train(stroke ~ ., 
                 data = df_train_smote, 
                 method = "xgbTree",
                 metric = "Kappa", 
                 trControl = fit_ctrl_kp10,
                 tuneGrid = expand.grid(
                   .nrounds = 100,
                   .max_depth = seq(3, 15, 1),
                   .eta = 0.3,
                   .gamma = 0.01,
                   .colsample_bytree = 1,
                   .min_child_weight = 1,
                   .subsample = 1
                 ),
                 verbose = FALSE,
                 verbosity = 0)

fit_xgb_kp$bestTune

5.2 Features importance

imp_vars_xgb <- varImp(fit_xgb_kp)

plot(imp_vars_xgb, main = "Variable Importance with XGB")

5.3 Testing

5.3.1 ROC curve

# calculate ROC
perf_pred_xgb <- get_roc(fit_xgb_kp, df_test)
perf_xgb <- perf_pred_xgb[[1]]
pred_xgb <- perf_pred_xgb[[2]]


# take AUC 
auc_xgb <- round(unlist(slot(performance(pred_xgb, measure = "auc"), "y.values")), 3)

# plot
plot(perf_xgb, main = "XGB ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_xgb))

5.3.2 TPR v FPR

# use pred_xgb object
plot(performance(pred_xgb, measure = "tpr", x.measure = "cutoff"),
     col = "steelblue", 
     ylab = "Rate", 
     xlab = "Probability cutoff")

plot(performance(pred_xgb, measure = "fpr", x.measure = "cutoff"), 
     add = T, col = "red")

legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"), 
       lty = 1, col = c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)

#abline(v = 0.1, lwd = 2, lty=6)

title("xgbTree")

5.3.3 Confusion matrix

pred_prob_xgb <- predict(fit_xgb_kp, newdata=df_test, type = "prob")

# choose your cut-off
cutoff = 0.12

# turn probabilities into classes
pred_class_xgb <- ifelse(pred_prob_xgb$yes > cutoff, "yes", "no")

pred_class_xgb <- as.factor(pred_class_xgb)

cm_xgb <- confusionMatrix(data = pred_class_xgb, 
                reference = df_test$stroke,
                mode = "everything",
                positive = "yes")

cm_xgb

6 Save the workspace


save.image("data/workspace.RData")